Data Preprocessing#
Student namen: Laiba Shamsul, Popke Snoek, Yoshi Fu, Pepeyn Velthuijse
Team nummer: G4
The following code is used to preprocess the datasets into files that are used for the data story. The code may raise an error due to the dataset being very large.
Note that it requires the dataset to be downloaded and inserted into the same folder as this notebook. You may find visualizations that did not make it into the actual data story. The dataset is too big to put on git, git lfs or for pandas.read_csv() to read using a share link on google drive/onedrive.
The entire cleaned dataset can be downloaded from: https://drive.google.com/file/d/1sTGPzVfk017Y8n2KPgsIxP9eXdl02YgH/view
If you want to see the code that made the cleaned dataset, then look at the ‘clean.py’ file in the github repository. FuYoshi/data_story_project
# Import packages
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
# Read the big dataset in chunks.
chunk_size = 1000000
dtypes = {"Country": str, "Mk": str, "Cn": str, "m (kg)": float, "Enedc (g/km)": float, "Ewltp (g/km)": float, "W (mm)": float, "Ft": str, "Ernedc (g/km)": float, "Erwltp (g/km)": float, "year": int}
chunk_container = pd.read_csv("CO2_data.csv", dtype=dtypes, chunksize=chunk_size)
# Split the dataset by year.
# This ensures that the kernel does not die due to memory shortage and some plots only require data of 2021.
co2_2015 = pd.DataFrame()
co2_2016 = pd.DataFrame()
co2_2017 = pd.DataFrame()
co2_2018 = pd.DataFrame()
co2_2019 = pd.DataFrame()
co2_2020 = pd.DataFrame()
co2_2021 = pd.DataFrame()
for chunk in chunk_container:
co2_2015 = pd.concat([co2_2015, chunk[chunk["year"] == 2015]], ignore_index=True)
co2_2016 = pd.concat([co2_2016, chunk[chunk["year"] == 2016]], ignore_index=True)
co2_2017 = pd.concat([co2_2017, chunk[chunk["year"] == 2017]], ignore_index=True)
co2_2018 = pd.concat([co2_2018, chunk[chunk["year"] == 2018]], ignore_index=True)
co2_2019 = pd.concat([co2_2019, chunk[chunk["year"] == 2019]], ignore_index=True)
co2_2020 = pd.concat([co2_2020, chunk[chunk["year"] == 2020]], ignore_index=True)
co2_2021 = pd.concat([co2_2021, chunk[chunk["year"] == 2021]], ignore_index=True)
co2_2021.head()
| Cn | Country | Enedc (g/km) | Ernedc (g/km) | Erwltp (g/km) | Ewltp (g/km) | Ft | Mk | W (mm) | m (kg) | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | MITSUBISHI OUTLANDER | IS | 40.0 | NaN | NaN | 46.0 | petrol-electric | Mitsubishi | 2670.0 | 1965.0 | 2021 |
| 1 | MITSUBISHI OUTLANDER | IS | 40.0 | NaN | NaN | 46.0 | petrol-electric | Mitsubishi | 2670.0 | 1965.0 | 2021 |
| 2 | MITSUBISHI OUTLANDER | IS | 40.0 | NaN | NaN | 46.0 | petrol-electric | Mitsubishi | 2670.0 | 1965.0 | 2021 |
| 3 | MITSUBISHI OUTLANDER | IS | 40.0 | NaN | NaN | 46.0 | petrol-electric | Mitsubishi | 2670.0 | 1965.0 | 2021 |
| 4 | MITSUBISHI OUTLANDER | IS | 40.0 | NaN | NaN | 46.0 | petrol-electric | Mitsubishi | 2670.0 | 1965.0 | 2021 |
# Set to true to convert data to csv. False otherwise.
to_csv = False
# Compute data for bar graph with average CO2 emission per country
country_emission = pd.DataFrame()
for df in [co2_2015, co2_2016, co2_2017, co2_2018, co2_2019, co2_2020, co2_2021]:
# Get subset of columns.
df = df[["Country", "year", "Ewltp (g/km)", "Erwltp (g/km)"]]
# Group by country and year and compute mean and std.
df = df.groupby(["Country", "year"]).agg({"Ewltp (g/km)": ['mean', 'sum'], "Erwltp (g/km)": 'mean'}).reset_index()
df.columns = df.columns.map(' '.join).str.strip()
country_emission = pd.concat([country_emission, df], ignore_index=True)
if to_csv:
country_emission.to_csv("country_emission.csv")
print(country_emission)
Country year Ewltp (g/km) mean Ewltp (g/km) sum Erwltp (g/km) mean
0 AT 2015 167.352565 9610053.7 1.740408
1 BE 2015 168.882209 4642909.7 2.788953
2 BG 2015 179.490853 665193.1 3.076667
3 CY 2015 173.255686 251394.0 NaN
4 CZ 2015 182.923737 2269534.8 1.707770
.. ... ... ... ... ...
195 PT 2021 107.297623 15676290.0 1.515715
196 RO 2021 126.562358 15296959.4 1.372131
197 SE 2021 90.271425 26249034.6 1.361123
198 SI 2021 133.912345 6943489.0 1.573788
199 SK 2021 139.223296 10530989.3 1.396190
[200 rows x 5 columns]
# Plot settings
rows = 2
cols = 1
fig = make_subplots(rows=rows, cols=cols, x_title="Landcode", y_title="CO2 emissie in WLTP (g/km)")
for year in [2015, 2016, 2017, 2018, 2019, 2020, 2021]:
# Find index of 'IE' to split the countries on ('IE' is often in the middle).
country_emission_year = country_emission[country_emission["year"]==year].reset_index()
split_index = country_emission_year[country_emission_year["Country"]=="IE"].index
country_emission_split = np.split(country_emission[country_emission["year"]==year], split_index)
# Subplot settings
showlegend = True
for row in range(rows):
for col in range(cols):
index = cols * row + col
fig.append_trace(
go.Bar(
x=country_emission_split[index]["Country"],
y=country_emission_split[index]["Ewltp (g/km) mean"],
name=year,
marker=dict(color=px.colors.qualitative.Plotly[year-2015]),
showlegend=showlegend,
),
row=row + 1,
col=col + 1,
)
showlegend = False
fig.update_layout(
title_text="Visualisatie 1: CO2-uitstoot door personenauto's per land in de EU tussen 2015 en 2021",
legend_title="Jaar",
)
fig.show()
# Plot settings
rows = 2
cols = 1
fig = make_subplots(rows=rows, cols=cols, x_title="Landcode", y_title="CO2 emissie reductie in WLTP (g/km)")
for year in [2015, 2016, 2017, 2018, 2019, 2020, 2021]:
# Find index of 'IE' to split the countries on ('IE' is often in the middle).
country_emission_year = country_emission[country_emission["year"]==year].reset_index()
split_index = country_emission_year[country_emission_year["Country"]=="IE"].index
country_emission_split = np.split(country_emission[country_emission["year"]==year], split_index)
# Subplot settings
for row in range(rows):
for col in range(cols):
index = cols * row + col
showlegend = row == col == 0
fig.append_trace(
go.Bar(
x=country_emission_split[index]["Country"],
y=country_emission_split[index]["Erwltp (g/km) mean"],
name=year,
marker=dict(color=px.colors.qualitative.Plotly[year-2015]),
showlegend=showlegend,
),
row=row + 1,
col=col + 1,
)
fig.update_layout(
title_text="Visualisatie 2: Gemiddelde CO2-uitstoot vermindering vanwege innovatieve technologieën per land in de EU tussen 2019 en 2021",
legend_title="Jaar",
)
fig.show()
# Compute pie chart with average CO2 emission per fuel type.
ft_mean_emission = co2_2021.groupby(co2_2021["Ft"]).agg({"Ewltp (g/km)": 'mean'})
# ft_mean_emission = co2_2021.groupby(co2_2021["Ft"])["Ewltp (g/km)"].mean()
ft_mean_emission = ft_mean_emission.sort_values(by="Ewltp (g/km)", ascending=False)
if to_csv:
ft_mean_emission.to_csv("ft_mean_emission.csv")
print(ft_mean_emission)
Ewltp (g/km)
Ft
e85 159.597920
diesel 144.489682
petrol 134.432168
other 123.094150
lpg 121.112130
ng-biomethane 113.262629
ng 104.922170
petrol-electric 41.888312
diesel-electric 39.109638
electric 0.000000
# Plot pie chart with average CO2 emission per fuel type.
fig = px.pie(ft_mean_emission,
names=ft_mean_emission.index,
values="Ewltp (g/km)",
color=ft_mean_emission.index,
title="CO2 emission by fuel type in 2021",
hole=0.8,
labels={
"Ewltp (g/km)": "CO2 emission WLTP (g/km)"
},
)
fig.update_layout(showlegend=False)
fig.update_traces(textposition='outside', textinfo="label + percent")
fig.update_layout(
updatemenus=[
dict(
type = "buttons",
direction = "left",
buttons=list([
dict(
args=["type", "pie"],
label="Pie",
method="restyle"
),
dict(
args=["type", "bar"],
label="Bar",
method="restyle"
)
]),
showactive=True,
x=0.11,
xanchor="left",
y=1.15,
yanchor="top"
),
]
)
# Add annotation
fig.update_layout(
annotations=[
dict(text="Trace type:", showarrow=False, x=0, y=1.12, yref="paper", align="left")
]
)
fig.show()
fig = make_subplots(specs=[[{"secondary_y": True}]])
years = [2015, 2016, 2017, 2018, 2019, 2020, 2021]
eu_emission = country_emission.groupby("year").sum()
fig.add_trace(
go.Scatter(x=years, y=eu_emission["Ewltp (g/km) mean"], name="Gemiddelde"),
secondary_y=False,
)
fig.add_trace(
go.Scatter(x=years, y=eu_emission["Ewltp (g/km) sum"], name="Totaal"),
secondary_y=True,
)
fig.update_layout(
title_text="CO2 emissie in de EU",
xaxis_title="Jaar",
)
fig.update_yaxes(title_text="Gemiddelde CO2 emissie in WLTP (g/km)", secondary_y=False)
fig.update_yaxes(title_text="Totale CO2 emissie in WLTP (g/km)", secondary_y=True)
fig.show()
# Lijst om de resultaten in op te slaan
counts_results = []
def classify_electric(x: str):
if "-electric" in x:
return "Semi-Elektrisch"
elif "electric" in x:
return "Elektrisch"
else:
return "Anders"
# Itereer over elke chunk
for chunk in pd.read_csv("CO2_data.csv", chunksize=1000000):
# Classificeer elektrische en hybride auto's als 'Elektrisch', en alle anderen als 'Anders'
chunk['Ft'] = chunk['Ft'].apply(classify_electric)
# chunk['Ft'] = chunk['Ft'].apply(lambda x: 'Elektrisch' if 'electric' in x or 'hybrid' in x else 'Anders')
# Doe de berekeningen
count_result = chunk.groupby(['year', 'Ft'], group_keys=False).size().reset_index(name='counts')
# Voeg het resultaat toe aan de lijst
counts_results.append(count_result)
# Concateneer alle resultaten in een dataframe
df_counts = pd.concat(counts_results)
# Splits bij jaar en brandstoftype
df_counts = df_counts.groupby(["year", "Ft"]).agg({"counts": 'sum'}).reset_index()
# Bereken totale emissie per jaar.
df_counts["sum"] = df_counts.groupby("year")["counts"].transform('sum')
# Bereken de percentages
df_counts["percent"] = 100 * df_counts["counts"] / df_counts["sum"]
if to_csv:
df_counts.to_csv("fuel_type_distribution.csv")
print(df_counts)
year Ft counts sum percent
0 2015 Anders 415321 419369 99.034740
1 2015 Elektrisch 1899 419369 0.452823
2 2015 Semi-Elektrisch 2149 419369 0.512437
3 2016 Anders 473519 478728 98.911908
4 2016 Elektrisch 2452 478728 0.512191
5 2016 Semi-Elektrisch 2757 478728 0.575901
6 2017 Anders 4859439 4935537 98.458162
7 2017 Elektrisch 43266 4935537 0.876622
8 2017 Semi-Elektrisch 32832 4935537 0.665216
9 2018 Anders 14598778 14898362 97.989148
10 2018 Elektrisch 148184 14898362 0.994633
11 2018 Semi-Elektrisch 151400 14898362 1.016219
12 2019 Anders 7975216 9793017 81.437784
13 2019 Elektrisch 980563 9793017 10.012880
14 2019 Semi-Elektrisch 837238 9793017 8.549337
15 2020 Anders 7975216 9793017 81.437784
16 2020 Elektrisch 980563 9793017 10.012880
17 2020 Semi-Elektrisch 837238 9793017 8.549337
18 2021 Anders 7975216 9793017 81.437784
19 2021 Elektrisch 980563 9793017 10.012880
20 2021 Semi-Elektrisch 837238 9793017 8.549337
# Definieer kleuren voor de brandstof types
colormap = {
"Elektrisch": 'blue',
"Semi-Elektrisch": 'purple',
"Anders": 'grey',
}
# Plot settings
fig = px.bar(df_counts,
x="year",
y="percent",
color="Ft",
title="Brandstof type distributie van personenauto's in de EU tussen 2015 en 2021",
labels={
"percent": "Percentage van alle personenauto's",
"year": 'Jaar',
"Ft": 'Brandstof type',
},
hover_data=['counts'],
color_discrete_map=colormap # Gebruik de kleurenkaart
)
fig.show()
total_emission_per_ft = co2_2021.groupby('Ft').agg({"Ewltp (g/km)": ['sum', 'size']})
total_emission_per_ft.columns = total_emission_per_ft.columns.map(' '.join).str.strip()
total_emission_per_ft = total_emission_per_ft.rename(columns={
"Ewltp (g/km) sum": "Ewltp (g/km)",
"Ewltp (g/km) size": "n",
})
total_emission_per_ft = total_emission_per_ft.sort_values(by="Ewltp (g/km)", ascending=False).reset_index()
if to_csv:
total_emission_per_ft.to_csv("total_emission_per_ft.csv")
print(total_emission_per_ft)
# Plot settings
fig = px.bar(total_emission_per_ft,
x="Ft",
y="Ewltp (g/km)",
title="Totale CO2-uitstoot van personenauto's in de EU per brandstof type (2021)",
custom_data=[total_emission_per_ft["n"]],
labels={
"Ewltp (g/km)": "CO2 emissie in WLTP (g/km)",
"Ft": "Brandstof type",
}
)
fig.update_traces(hovertemplate="CO2 emissie in WLTP (g/km)=%{y}<br>Aantal auto's=%{customdata[0]}")
fig.show()
Ft Ewltp (g/km) n
0 petrol 735257118.5 5469354
1 diesel 322991801.5 2235397
2 petrol-electric 32944236.2 786478
3 lpg 26007982.1 214743
4 ng 4101197.8 39088
5 diesel-electric 1985205.2 50760
6 e85 1150701.0 7210
7 ng-biomethane 864080.6 7629
8 other 220954.0 1795
9 electric 0.0 980563
# Compute data for emission per brand per year.
make_emission = pd.DataFrame()
for df in [co2_2015, co2_2016, co2_2017, co2_2018, co2_2019, co2_2020, co2_2021]:
# Get subset of columns.
df = df[["year", "Mk", "Ft", "Ewltp (g/km)"]]
# Group by year and brand and compute mean.
df = df.groupby(["year", "Mk"]).agg({"Ewltp (g/km)": 'mean', "Ft": 'first'}).reset_index()
make_emission = pd.concat([make_emission, df], ignore_index=True)
print(make_emission)
year Mk Ewltp (g/km) Ft
0 2015 Alfa Romeo 158.938858 diesel
1 2015 Alpina 233.141509 petrol
2 2015 Aston Martin 410.137643 petrol
3 2015 Audi 177.115525 petrol
4 2015 BMW 179.843678 diesel
.. ... ... ... ...
490 2021 UAZ 276.000000 petrol
491 2021 Volkswagen 118.445517 diesel
492 2021 Volvo 110.584278 petrol-electric
493 2021 ZD 0.000000 electric
494 2021 Škoda 126.702169 petrol
[495 rows x 4 columns]
car_emission_mass = co2_2021.sample(n=100000, replace=False)
car_emission_mass = car_emission_mass[["Ft", "Ewltp (g/km)", "m (kg)"]]
if to_csv:
car_emission_mass.to_csv("car_emission_mass.csv")
# Plot settings
fig = px.scatter(car_emission_mass,
x="m (kg)",
y="Ewltp (g/km)",
color="Ft",
)
# Create toggle visibility button for every fuel type.
buttons = []
for i, val in enumerate(car_emission_mass["Ft"].unique()):
button = dict(method='restyle',
label=val,
args=[{'visible': True}, [i]],
args2=[{'visible': "legendonly"}, [i]],
)
buttons.append(button)
# Menu settings
fig.update_layout(
updatemenus=[
dict(
buttons=buttons,
visible=False,
),
],
annotations=[
dict(text="Toggle traces by clicking the legend", x=0, xref="paper", y=1.05, yref="paper", align="right", showarrow=False),
],
title_text="A really interesting title",
legend_title="Brandstof type",
xaxis_title="Gewicht (kg)",
xaxis_range=(0, 3500),
yaxis_title="CO2 emissie in WLTP (g/km)",
yaxis_range=(0, 550),
)
fig.show()
car_emission = pd.read_csv("car_emission.csv")
df_split = np.array_split(car_emission, 8)
# Split data into different parts.
plot1 = px.scatter(df_split[0], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['indianred', 'limegreen', 'aqua'], hover_data=[df_split[0]['Ft']])
plot2 = px.scatter(df_split[1], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['darkmagenta', 'darkolivegreen', 'darkorange'], hover_data=[df_split[1]['Ft']])
plot3 = px.scatter(df_split[2], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['deeppink', 'deepskyblue', 'dimgray'], hover_data=[df_split[2]['Ft']])
plot4 = px.scatter(df_split[3], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['forestgreen', 'fuchsia', 'gainsboro'], hover_data=[df_split[3]['Ft']])
plot5 = px.scatter(df_split[4], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['lavenderblush', 'lawngreen', 'lemonchiffon'], hover_data=[df_split[4]['Ft']])
plot6 = px.scatter(df_split[5], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['magenta', 'maroon', 'mediumaquamarine'], hover_data=[df_split[5]['Ft']])
plot7 = px.scatter(df_split[6], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['midnightblue', 'goldenrod', 'mediumseagreen'], hover_data=[df_split[6]['Ft']])
plot8 = px.scatter(df_split[7], x='year', y='Enedc (g/km) mean', color='Mk', color_discrete_sequence =['orangered', 'orchid', 'palegoldenrod'], hover_data=[df_split[7]['Ft']])
fig = make_subplots(rows=3, cols=3, shared_yaxes=True, x_title="Jaar", y_title="CO2 emission in NEDC (g/km)")
# Add the trace of each part to the plot.
fig.add_trace(plot1['data'][0], row=1, col=1)
fig.add_trace(plot1['data'][1], row=1, col=1)
fig.add_trace(plot1['data'][2], row=1, col=1)
fig.append_trace(plot2['data'][0], row=1, col=2)
fig.append_trace(plot2['data'][1], row=1, col=2)
fig.append_trace(plot2['data'][2], row=1, col=2)
fig.append_trace(plot3['data'][0], row=1, col=3)
fig.append_trace(plot3['data'][1], row=1, col=3)
fig.append_trace(plot3['data'][2], row=1, col=3)
fig.append_trace(plot4['data'][0], row=2, col=1)
fig.append_trace(plot4['data'][1], row=2, col=1)
fig.append_trace(plot4['data'][2], row=2, col=1)
fig.append_trace(plot5['data'][0], row=2, col=2)
fig.append_trace(plot5['data'][1], row=2, col=2)
fig.append_trace(plot5['data'][2], row=2, col=2)
fig.append_trace(plot6['data'][0], row=2, col=3)
fig.append_trace(plot6['data'][1], row=2, col=3)
fig.append_trace(plot6['data'][2], row=2, col=3)
fig.append_trace(plot7['data'][0], row=3, col=1)
fig.append_trace(plot7['data'][1], row=3, col=1)
fig.append_trace(plot7['data'][2], row=3, col=1)
fig.append_trace(plot8['data'][0], row=3, col=2)
fig.append_trace(plot8['data'][1], row=3, col=2)
fig.append_trace(plot8['data'][2], row=3, col=2)
# Plot settings
fig.update_layout(
showlegend=True,
height=800,
width=800,
title_text="Gemiddelde CO2-uitstoot van personenauto's per merk tussen 2015 en 2021",
legend_title="Automerk",
)
fig.update_xaxes(tickangle=-45, categoryarray=[2015, 2016, 2017, 2018, 2019, 2020, 2021], type='category')
fig.show()